###########################################
# Load the raw Minnesota death records, parse `death_date` from m/d/Y text,
# keep only the columns used downstream (every `residence*` column, age,
# names, birth date, `covid`, and the `subject_*` indicators), and restrict
# to deaths whose year is 2020 or 2021.
d2020 <- readRDS("raw_data/mn_death_data/MN mortality.rds") |>
  mutate(death_date = as.Date(death_date, "%m/%d/%Y")) |>
  select(
    starts_with("residence"),
    "decd_age_in_years",
    decd_first_name,
    decd_last_name,
    birth_date,
    covid,
    death_date,
    subject_not_hispanic,
    subject_white,
    subject_african_american
  ) |>
  filter(year(death_date) %in% c(2020, 2021))

# Normalise `residence_street` and drop records that cannot be tied to a
# physical Minnesota address.
#   1. Replace every run of punctuation/spaces with one space and upper-case.
#   2. Drop rows whose street mentions any of the exclusion terms below, is
#      blank after trimming, or whose `residence_state` is not "MINNESOTA".
#   3. Collapse repeated whitespace, trim, and strip the unit markers
#      " APT " and " APARTMENT " (in that order; the two substitutions are
#      deliberately kept sequential).
d2020 <- as.data.table(d2020) |>
  mutate(
    residence_street = toupper(gsub("[[:punct:] ]+", " ", residence_street))
  ) |>
  filter(
    # TRUE when none of the exclusion terms occurs in the street string.
    !Reduce(
      `|`,
      lapply(
        c("UNKNOWN", "P O BOX", "PO BOX", "POBOX",
          "HOMELESS", "TRANSIENT", "NO PERMANENT ADDRESS"),
        \(term) grepl(term, residence_street)
      )
    ),
    trimws(residence_street) != "",
    residence_state == "MINNESOTA"
  ) |>
  mutate(
    residence_street = trimws(gsub("\\s+", " ", residence_street)),
    residence_street = gsub(" APT ", " ", residence_street),
    residence_street = gsub(" APARTMENT ", " ", residence_street)
  )

# Address-standardisation lookup: each row maps a `search` token to its
# `replace` form. Rows where the two are already equal are dropped, both
# columns are upper-cased, and duplicate rows are removed.
ad_cl <- fread("raw_data/address_cleaner.csv") |>
  filter(search != replace) |>
  mutate(across(c(search, replace), toupper)) |>
  distinct()

# Apply every rule, in table order, to whole tokens of the street address.
# A token is delimited by start/end of string or whitespace; the delimiters
# are captured and written back so that spacing is preserved.
# NOTE: `search` values are pasted into the pattern unescaped, so any regex
# metacharacters they contain are interpreted as regex.
#
# `seq_len()` makes an empty lookup table a true no-op (the previous
# `1:nrow()` iterated over c(1, 0)), and the street vector is threaded through
# `Reduce()` so the data.table column is assigned once instead of once per
# rule. The per-iteration `print(i)` debug output has been removed.
d2020$residence_street <- Reduce(
  function(street, i) {
    gsub(paste0("(^|\\s)", ad_cl$search[i], "(\\s|$)"),
         paste0("\\1", ad_cl$replace[i], "\\2"),
         street)
  },
  seq_len(nrow(ad_cl)),
  d2020$residence_street
)

# Clean the decedent name fields and the city:
#   * names: re-encode from WINDOWS-1252 to UTF-8, lower-case, turn empty
#     strings into NA, then strip punctuation and spaces;
#   * city: strip punctuation and upper-case.
# Finally remove the "label" and "format.stata" attributes from the data.
d2020 <- d2020 |>
  mutate(
    across(
      c("decd_first_name", "decd_last_name"),
      \(nm) gsub(
        "[[:punct:]]| ",
        "",
        ifelse(nm == "", NA, tolower(iconv(nm, "WINDOWS-1252", "UTF-8")))
      )
    ),
    residence_city = toupper(gsub("[[:punct:]]", "", residence_city))
  ) |>
  labelled::remove_attributes(c("label", "format.stata"))


# Abbreviate compass words in the street address. These are plain substring
# replacements (not whole-word), applied in the order NORTH, SOUTH, EAST,
# WEST; the voter file's `street` column receives the same four replacements
# further down, so both sides stay comparable.
d2020$residence_street <- d2020$residence_street |>
  gsub(pattern = "NORTH", replacement = "N") |>
  gsub(pattern = "SOUTH", replacement = "S") |>
  gsub(pattern = "EAST", replacement = "E") |>
  gsub(pattern = "WEST", replacement = "W")

# Split the address at its first run of whitespace: the leading piece goes to
# `num`, everything after it to `rest`. The original column is kept.
d2020 <- extract(
  d2020,
  residence_street,
  into = c("num", "rest"),
  regex = "(.*?)\\s+(.*)",
  remove = FALSE
)

# Checkpoint the cleaned death file, then read it straight back so the
# remainder of the script works from the saved copy.
saveRDS(d2020, "temp/cleaned_death_mn.rds")

d2020 <- readRDS("temp/cleaned_death_mn.rds")
#####################################

# Voter file: load the pre-cleaned copy and bring city / street into the same
# shape as the death records.
k <- readRDS("temp/cleaned_file_mn.rds") |>
  mutate(
    Residence_Addresses_City =
      toupper(gsub("[[:punct:]]", "", Residence_Addresses_City))
  )

# Same substring abbreviations as applied to the death-record addresses,
# followed by removal of the unit markers (APARTMENT first, then APT).
k$street <- k$street |>
  gsub(pattern = "NORTH", replacement = "N") |>
  gsub(pattern = "SOUTH", replacement = "S") |>
  gsub(pattern = "EAST", replacement = "E") |>
  gsub(pattern = "WEST", replacement = "W") |>
  gsub(pattern = " APARTMENT ", replacement = " ") |>
  gsub(pattern = " APT ", replacement = " ")

# Drop voters whose birth date + street + city concatenation equals that of
# some death record, then split `street` at its first run of whitespace into
# `num` and `rest` (keeping `street`).
k <- k |>
  filter(
    !(paste0(Voters_BirthDate, street, Residence_Addresses_City) %in%
        paste0(d2020$birth_date, d2020$residence_street, d2020$residence_city))
  ) |>
  extract(
    street,
    into = c("num", "rest"),
    regex = "(.*?)\\s+(.*)",
    remove = FALSE
  )

# Matching key used below: street remainder followed by the city.
k$rest <- paste(k$rest, k$Residence_Addresses_City)

# Build the death-side matching key (`rest` + city) and keep only deaths on
# or before 2020-12-31 or on/after 2021-05-03.
# `death_date` was already parsed when the file was first loaded, so the
# `as.Date()` call here is expected to leave it unchanged.
d2020 <- d2020 |>
  mutate(death_date = as.Date(death_date, "%m/%d/%Y"),
         rest = paste(rest, residence_city)) |>
  filter(death_date <= "2020-12-31" | death_date >= "2021-05-03")

# `good`: death records whose key appears verbatim in the voter file.
# `bad` : everything else; these go through fuzzy matching.
good <- filter(d2020, rest %in% k$rest)

bad <- filter(d2020, !(rest %in% k$rest))

# One row per distinct unmatched key (with its record count `n`).
bad_ads <- bad |>
  group_by(rest) |>
  tally()

# Fuzzy-match each unmatched key against the voter keys using Levenshtein
# distance, accepting candidates at distance <= 3 (NA when none qualifies).
# The voter keys are de-duplicated first: `k$rest` repeats once per voter at
# an address, and `amatch()` returns the first closest entry, so scanning the
# unique keys (in first-occurrence order) yields the same matched strings
# with far fewer comparisons.
voter_keys <- unique(k$rest)

bad_ads$best <- amatch(bad_ads$rest,
                       voter_keys,
                       maxDist = 3, method = "lv")

bad_ads$new <- voter_keys[bad_ads$best]

bad_ads$dist <- stringdist(bad_ads$rest, bad_ads$new, method = "lv")

# Attach the matched key (`new`) and its distance (`dist`) to every unmatched
# record. The join key is stated explicitly instead of relying on the
# implicit natural join over whatever columns happen to be shared.
bad <- left_join(bad, bad_ads, by = "rest") |>
  select(-best, -n)

# Exact matches carry distance 0 and map to themselves.
good <- good |>
  mutate(dist = 0,
         new = rest)

full <- bind_rows(good, bad)

saveRDS(full, "temp/mn_death_with_dists.rds")

#######################################################
#######################################################
#######################################################

# Exact-match pass: collapse death records to one row per address and period.
#   pre   : TRUE when the death occurred before 2021-05-03
#   white : TRUE when both subject_white and subject_not_hispanic are "Y"
# Only exact address matches (dist == 0) within the two retained date windows
# are used. Within each (street, city, pre) group the summary keeps the mean
# death date and age, the maximum `covid`, the number of deaths, the
# alphabetically first last name, and the minimum of `white`.
one_per <- readRDS("temp/mn_death_with_dists.rds") |>
  mutate(
    death_date = as.Date(death_date, "%m/%d/%Y"),
    pre = death_date < "2021-05-03",
    white = (subject_white == "Y") & (subject_not_hispanic == "Y")
  ) |>
  filter(
    dist == 0,
    death_date <= "2020-12-31" | death_date >= "2021-05-03"
  ) |>
  group_by(residence_street, residence_city, pre) |>
  summarize(
    death_date = mean(death_date),
    age = mean(decd_age_in_years),
    covid = max(covid),
    n_dead = n(),
    last_name = min(decd_last_name),
    white = min(white)
  )

# Addresses with a death in the earlier window ...
tre <- one_per |>
  filter(pre)

# ... and addresses with a death in the later window.
pra <- one_per |>
  filter(!pre)


#####################################

# Attach earlier-window deaths to voters by exact street + city and code the
# treatment status:
#   n_dead  : number of deaths at the address (0 when there is no match)
#   treated : 0 = no matched death, 1 = matched and `covid` is falsy,
#             2 = matched and `covid` is truthy
k_m <- k |>
  left_join(
    tre,
    by = c("street" = "residence_street",
           "Residence_Addresses_City" = "residence_city")
  ) |>
  mutate(
    n_dead = ifelse(is.na(n_dead), 0, n_dead),
    treated = ifelse(is.na(covid), 0, ifelse(covid, 2, 1))
  )

# Voters living at an address with an earlier-window death.
k_m |>
  filter(treated > 0) |>
  saveRDS("temp/vf_mn.rds")

# Voters without an earlier-window death: strip the (empty) death columns and
# the address pieces, then keep only those whose address has a later-window
# death.
k_m2 <- k_m |>
  filter(treated == 0) |>
  select(-c(death_date, age, covid, n_dead,
            last_name, treated, num, rest, pre, white)) |>
  inner_join(
    pra,
    by = c("street" = "residence_street",
           "Residence_Addresses_City" = "residence_city")
  )

saveRDS(k_m2, "temp/vf_later_mn.rds")

# NOTE(review): `cleanup()` is defined outside this file; presumably it clears
# the workspace while keeping `k` -- confirm against its definition.
cleanup("k")
#################################################
#################################################
#################################################


# Fuzzy-match pass: same household summary as the exact pass, but accepting
# any record whose matched key is within Levenshtein distance < 3 (records
# with no match, i.e. NA distance, are dropped). `rest` is overwritten with
# the matched voter-file key (`new`) so the result can be joined to the voter
# file on (num, rest).
one_per <- readRDS("temp/mn_death_with_dists.rds") |>
  mutate(
    death_date = as.Date(death_date, "%m/%d/%Y"),
    pre = death_date < "2021-05-03",
    white = (subject_white == "Y") & (subject_not_hispanic == "Y")
  ) |>
  filter(
    !is.na(dist),
    dist < 3,
    death_date <= "2020-12-31" | death_date >= "2021-05-03"
  ) |>
  mutate(rest = new) |>
  group_by(num, rest, pre) |>
  summarize(
    death_date = mean(death_date),
    age = mean(decd_age_in_years),
    covid = max(covid),
    n_dead = n(),
    last_name = min(decd_last_name),
    white = min(white)
  )

# Addresses with a death in the earlier window ...
tre <- one_per |>
  filter(pre)

# ... and addresses with a death in the later window.
pra <- one_per |>
  filter(!pre)


#####################################

# Attach earlier-window deaths (fuzzy-matched addresses) to voters on the
# house-number / street-plus-city key and code the treatment status:
#   n_dead  : number of deaths at the address (0 when there is no match)
#   treated : 0 = no matched death, 1 = matched and `covid` is falsy,
#             2 = matched and `covid` is truthy
k_m <- k |>
  left_join(tre, by = c("num", "rest")) |>
  mutate(
    n_dead = ifelse(is.na(n_dead), 0, n_dead),
    treated = ifelse(is.na(covid), 0, ifelse(covid, 2, 1))
  )

# Voters living at an address with an earlier-window death.
k_m |>
  filter(treated > 0) |>
  saveRDS("temp/vf_mn_dist.rds")

# Voters without an earlier-window death: strip the (empty) death columns,
# then keep only those whose address has a later-window death. `num` and
# `rest` are retained here because they are the join keys.
k_m2 <- k_m |>
  filter(treated == 0) |>
  select(-c(death_date, age, covid, n_dead,
            last_name, treated, pre, white)) |>
  inner_join(pra, by = c("num", "rest"))

saveRDS(k_m2, "temp/vf_later_mn_dist.rds")
